Below example was made by R.
I just make the example again using jupyter notebook.
library(GEOquery)
library(affy)
library(dendextend)
library(biomaRt)
library(limma)
# Below code should be executed to download files.
#getGEOSuppFiles("GSE59880")
#untar("GSE59880/GSE59880_RAW.tar", exdir = "GSE59880/CEL")
list.files("GSE59880")
#list.files("GSE59880/CEL")
I use 'tar' and 'gzip' commands to uncompress files manually instead of '/bin/gtar' command.
celfiles <- list.files("GSE59880/CEL")
celfiles <- unlist(celfiles)
rawData <- ReadAffy(filenames = celfiles,celfile.path='/home/vagrant/maybegit/tensorflow-exercise/R/GSE59880/CEL/')
'/home/vagrant/maybegit/tensorflow-exercise/R/GSE59880/CEL/' is my directory path.
edata_raw <- exprs(rawData)
dim(edata_raw)
edata_raw[1:10,]
sum(is.na(edata_raw))
par(mfrow=c(1,2))
boxplot(log(edata_raw[,1:15]), main="old", col=2,range=0 )
boxplot(log(edata_raw[,16:30]), main="young", col=2,range=0 )
par(mfrow=c(1,1))
hist(log(edata_raw[,1]))
mm = log(edata_raw[0:50000,1]+1) - log(edata_raw[0:50000,2]+1)
aa = log(edata_raw[0:50000,1]+1) + log(edata_raw[0:50000,2]+1)
plot(aa,mm,col=2)
normData <- rma(rawData)
normData
dim(normData)
head(exprs(normData))
edata <- exprs(normData)
boxplot(edata[,1:30],main="expression data",col=2)
geoMat <- getGEO("GSE59880",destdir = "/home/vagrant/maybegit/tensorflow-exercise/R/GSE59880")
pD.all <- pData(geoMat[[1]])
head(pD.all)
names(pD.all)
##pD <- pD.all[, c("title", "geo_accession", "characteristics_ch1.1","characteristics_ch1.2","characteristics_ch1.3","characteristics_ch1.4","platform_id" )]
pD <- pD.all[, c("title", "geo_accession", "characteristics_ch1.1","characteristics_ch1.2")]
names(pD)[c(3,4)] <- c("age", "fitness")
## head(pD)
##pD$age
pD$age <- sub("^age: ","", pD$age)
##pD$age
##pD$fitness
pD$fitness <- sub("^aerobic fitness: ", "", pD$fitness)
##pD$fitness
changeName <- function(x) {
print("function starts.... object length is")
print(length(x))
for (i in 1:length(x)) {
x[i] <- substr(x[i], start=1, stop=10)
print(x[i])
}
return(x)
}
sampleNames(normData) <- changeName(sampleNames(normData))
sampleNames(normData)
group <- ifelse((pD$age)>50, "old", "young")
pD <- cbind(pD,group)
pData(normData) <- pD
mod = model.matrix(~ pData(normData)$group)
fit_limma = lmFit(exprs(normData),mod)
ebayes_limma = eBayes(fit_limma)
limma_pvals = topTable(ebayes_limma,adjust.method="BH",sort.by="none",number=dim(edata)[1])
dim(limma_pvals)
head(limma_pvals[limma_pvals$adj.P.Val < 0.05,])
sum(limma_pvals$adj.P.Val < 0.05)
cgenes <- rownames(limma_pvals[limma_pvals$adj.P.Val < 0.05,])
length(cgenes)
limma_pvals_test2 = topTable(ebayes_limma,adjust.method="hochberg",sort.by="logFC",number=dim(edata)[1])
tgenes <- rownames(limma_pvals_test2[limma_pvals_test2$adj.P.Val < 0.05,])
length(tgenes)
#sum(tgenes %in% genes)
#library(xlsx)
#library(xsv)
data <- read.csv("/home/vagrant/maybegit/tensorflow-exercise/R/GSE59880/aging_genes.csv")
head(data)
genes <- as.vector(as.matrix(data))
sum(cgenes %in% genes)
genes_age <- exprs(normData)[rownames(exprs(normData)) %in% genes,]
head(genes_age)
boxplot(genes_age)
heatmap(genes_age)
dist1 = dist(t(genes_age))
hclust1 = hclust(dist1)
plot(hclust1)
dend = as.dendrogram(hclust1)
dend = color_labels(hclust1,2,col=1:2)
plot(dend)
Sys.info()